########################################
## R code


runBigMahal_BITs <- function(HRA, inData, outData){

#setwd("C:\\Users\\Richard Nielsen\\Desktop\\Papers\\Rewards for Ratification\\bits\\bitjunk")

library(foreign)
dat <- read.dta(inData, convert.underscore=T)

head(dat)

dim(dat)
table(dat$treated)

  ## Not all of the observations have all of the lags and leads
#grep("^l[12345]",names(dat))
## drop the obs that don't have all the lags
#dat <- dat[rownames(na.omit(dat[,c(grep("^l[12345]",names(dat)))])), ]

  ## New na.omit with just the lags that I'm keeping
lags <- c("mjfdi","jextract","jcorruption","jgdp","jgdpcap","jchgdplag2","jfdilag","jca_gdp","jdem","jpriv","sfdi","tradgdp")
lagnames <- paste(sort(rep(paste("l",c(2:5),sep=""),length(lags))), lags, sep="")
allvars <- c(names(dat)[grep("^l[1]",names(dat))], lagnames)
allvars <- gsub("_",".",allvars,fixed=T)
dat <- dat[rownames(na.omit(dat[,allvars])), ]

## We are losing a ton of observations by dropping everything that doesn't
## have the lags.  But I'm not using the lags for the matching.

## this is the matching formula
ftmp <- "treated ~ year "

## pull out all the l1vars
l1vars <- c()
for(i in 1:ncol(dat)){
  if(length(grep("l1",names(dat)[i]))>0){
    ## we skip a couple of the variables that have no variation
    if(names(dat)[i]=="l1art22name.2"){next}
    if(names(dat)[i]=="l1colorbit"){next}
    l1vars <- c(l1vars,names(dat)[i])
  }
}

ftmp1 <- as.formula(paste(ftmp, "+",paste(l1vars,collapse=" + ")))
## add the outcome variable so we can na.omit the data
l1vars <- c(l1vars,"bit012345")
ftmp2 <- as.formula(paste(ftmp, "+",paste(l1vars,collapse=" + ")))

ftmp2
names(na.omit(model.frame(ftmp2, dat)))
table(na.omit(model.frame(ftmp2, dat))$treated)
  ## grab these observations to do the na.omit
mframe <- rownames(na.omit(model.frame(ftmp2, dat)))

#dat <- na.omit(dat)
dat <- dat[mframe,]


## compare treated to untreated
t.means <- apply(model.frame(ftmp1, dat[dat$treated==1,]), MAR=2,FUN=mean)
c.means <- apply(model.frame(ftmp1, dat[dat$treated==0,]), MAR=2,FUN=mean)

t.means-c.means

  ## a mahalanobis function
  ## myMH function from http://www.stat.lsa.umich.edu/~bbh/optmatch/doc/mahalanobisMatching.pdf

  ## the second stopifnot condition was in ben hansen's code
  ## but the code seems to work fine for even one variable
  ## and I have samples with just one covariate.
myMH <- function(Tnms, Cnms, inv.cov, data) {
 stopifnot(!is.null(dimnames(inv.cov)[[1]]),# dim(inv.cov)[1] > 1,
 all.equal(dimnames(inv.cov)[[1]], dimnames(inv.cov)[[2]]),
 all(dimnames(inv.cov)[[1]] %in% names(data)))
 covars <- dimnames(inv.cov)[[1]]
 xdiffs <- as.matrix(data[Tnms, covars])
 xdiffs <- xdiffs - as.matrix(data[Cnms, covars])
 rowSums((xdiffs %*% inv.cov) * xdiffs)
}


  ## This is the FAST m-dist function for large data
myMH4 <- function(Tnms, Cnms, inv.cov, data,k){
    ## the basic idea now is that we do the distance calculation
    ## and matching all in one step, so that we don't have to
    ## make a huge distance matrix and then cycle through it.
    ## This also means that as we get fewer and fewer available
    ## control units, the distance calculation should get faster.
  icv <- inv.cov
  mdist <- matrix(NA,length(Tnms),length(Cnms))
  ixnay <- c()
  t0 <- Sys.time()
    ## make some holders
  match.matrix <- matrix(NA, length(Tnms), k)
  rownames(match.matrix) <- Tnms
  dist.matrix <- match.matrix
    ## loop over the match ratio
  for(j in 1:k){
      ## loop over the treated units
    for(i in 1:length(Tnms)){
      if(i%%100==0){
        print(paste("Match",j,"of",k,", T",i,"of",length(Tnms),":",round(Sys.time()-t0,2)))
      }
        ## define a vector of the Cnms that are still available
      elig <- Cnms[(Cnms%in%ixnay)==F]
      #if(i%%500==0){print(paste("treated unit",i,"of",length(Tnms)))}
        ## make the distance vector for treated t and all the eligable
        ## controls
      x <- outer(Tnms[i], elig, FUN = myMH, inv.cov = icv, data = data)
        ## capture the colnames of the ones that have
        ## minimum distances and take the first
      min.x <- min(x)
      m <- (elig[(x==min.x)==T])[1]
        ## add the things to the output matrices
      match.matrix[i,j] <- m
      dist.matrix[i,j] <- min.x
        ## add the new match to ixnay
      ixnay <- c(ixnay, m)
        ## stop the iterations if there are no more control units
      if(length(ixnay)==length(Cnms)){break}
    }
    ## stop the iterations if there are no more control units
    if(length(ixnay)==length(Cnms)){break}
  }
  return(list("match.matrix"=match.matrix,
              "dist.matrix"=dist.matrix))
}

 
mhmatchBig <- function(formula, data, ratio=1){

  ## I think this function has a dependence on "tvar"

    ## pull out the covariates
  X1 <- data
  X2 <- data.frame(model.frame(formula, data))
  tvar <- as.character(formula)[2]
    ## just the covariates
  X3 <- subset(data.frame(model.matrix(formula,data)),select=-1)
  mvrs <- colnames(X3)
    ## covariates and treatment variable
  X <- cbind(X2[,1],X3)
  names(X) <- c(tvar,mvrs)

    ## set the number of matches, 1-to-k
  k <- ratio

    ## if it can't invert, this is where it will fail
  try(icv <- solve(cov(subset(X,select=mvrs))))
  #if(exists("icv")==F){
  #  stop
  #  print("VCOV matrix can't invert with these covariates")
  #}
  stopifnot(exists("icv"))
  trtnms <- row.names(X)[X[[tvar]]==1]
  ctlnms <- row.names(X)[X[[tvar]]==0]
    ## use the data X3 that has just the covariates

    ## run the main internal function
  #try(mydist <- myMH4(trtnms, ctlnms, inv.cov=icv, data=X3,k=k))
  ## Loop over single treated units, running the mahal function
  holder <- list(match.matrix=c(),dist.matrix=c())
  cat("finding matches for a big data set\n")
  pb <- txtProgressBar(min=1,max=length(trtnms),initial = 1, style = 3)
  for(i in 1:length(trtnms)){
    setTxtProgressBar(pb, i)
    #print(i)
    mydist <- myMH4(trtnms[i], ctlnms, inv.cov=icv, data=X3,k=k)
    holder$match.matrix <- rbind(holder$match.matrix,mydist$match.matrix)
    holder$dist.matrix <- rbind(holder$dist.matrix,mydist$dist.matrix)
    ctlnms <- ctlnms[-which(ctlnms %in% mydist$match.matrix[1,1])]    
  }
  close(pb)

  match.matrix <- holder$match.matrix
  dist.matrix <- holder$dist.matrix

  ## make the matched data
    ## first, select all the treated obs
  mh.data <- data[rownames(match.matrix)[is.na(match.matrix[,1])==F],]
    ## then add the control obs
  for(i in 1:ncol(match.matrix)){
    mh.data <- rbind(mh.data,data[na.omit(match.matrix[,i]),])
  }

  ## PROBLEM HERE WITH THE DISTANCES -- NEED TO GET THIS RIGHT

  dd <- c(dist.matrix,dist.matrix)
  names(dd) <- c(rownames(match.matrix),match.matrix)
  mh.data$distance <- dd[rownames(mh.data)]

  mh.data$strata <- c(1:nrow(match.matrix),1:nrow(match.matrix))
#  ## add distances -- mean of all distances for a matched group
#  dd <- apply(dist.matrix, MAR=1, mean, na.rm=T)
#  distance <- matrix(NA,nrow(mh.data),1)
#  rownames(distance) <- rownames(mh.data)
#  distance[names(na.omit(dist.matrix[,1])),] <- dd[names(na.omit(dist.matrix[,1]))]
#  for(i in 1:ncol(dist.matrix)){
#    names(dd) <- match.matrix[,i]
#    distance[na.omit(match.matrix[,i]),] <- dd[na.omit(match.matrix[,i])]
#  }
#  mh.data$distance <- distance
#
#sort(rownames(mh.data))
#
#    ## add weights to the data
#  weights <- matrix(NA,nrow(mh.data),1)
#  rownames(weights) <- rownames(mh.data)
#  weights[names(na.omit(match.matrix[,1])),]<-1
#    ## calculate the weights
#  countf <- function(x){sum(is.na(x))}
#  wts <- 1/(k-apply(match.matrix, MAR=1, countf))
#    ## fill in the weights
#    ## this has me all mixed up, but I think it works
#  for(i in 1:ncol(dist.matrix)){
#    names(wts) <- match.matrix[,i]
#    weights[na.omit(match.matrix[,i]),] <-wts[na.omit(match.matrix[,i])]
#  }
#  mh.data$weights <- weights

  
    ## RETURN
  return(list("match.matrix"=match.matrix, "dist.matrix"=dist.matrix,
              "match.dat"=mh.data))
}

## End of mahalanobis function

ftmp <- ftmp1

set.seed(1234)
mout <- mhmatchBig(ftmp, data = dat)

## I'm writing out the full matched dataset
#setwd("C:\\Users\\Rich\\Desktop\\Papers\\Rewards for Ratification\\bits\\junk")
library(foreign)
write.dta(mout$match.dat,outData)

#####################################################

}

